Hello,
I would like to implement a parallel processing aspect to my program that captures images and performs a series of FFT’s and miscellaneous calculations on the GPU.
I would like to begin processing as soon as the image is copied over to the GPU, while the next image is being captured in parallel with the processing. The structure I want to implement looks something like (I apologize if this is confusing):
capture image1 → copy image1 to GPU → capture image2 -->_______copy to image2 to GPU →
_____________________________________Process image1 --------->
I have tested this version, but it is still slower than anticipated. Is this being performed correctly?
I am posting in this cuda forum because there may be a more effective way of accomplishing this using streams rather than creating a thread. I am new with all types of parallel programming, so any type of advice, resources,etc is welcome.
I have removed a large portion of code dealing with cuda error checking, and the details of each cuda kernel. This program is functional, the main area of concern is the capture ‘while’ loop.
#include <cufft.h>
#include <FlyCapture2.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <chrono>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <thread>
#include "rt_nonfinite.h"
#include "get_peaks.h"
#include "main.h"
#include "get_peaks_terminate.h"
#include "get_peaks_initialize.h"
#define height 2048
#define width 2448
#define size 5013504
dim3 threadsPerBlock(32,32);
dim3 numBlocks(77,64);
using namespace FlyCapture2;
__global__ void datatransfer(cufftComplex *f2, float *f)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x>=0 && x<width && y>=0 && y< height) {
//Do something
...
}
}
__global__ void magnitude_kernel(cufftComplex *out, cufftComplex *in2)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x>=0 && x<width && y>=0 && y< height) {
//Do something
...
}
}
__global__ void swap_quadrants(cufftComplex *old_img,cufftComplex *new_img)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x>=0 && x<width/2 && y>=0 && y< height/2) {
//Do something
...
}
}
__global__ void transferpeakdata(float *row, cufftComplex *image)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
//Do something
...
}
void algorithm(float *d_a,
cufftComplex *a2,
cufftComplex *a3,
float *h_1d,
float *d_1d,
double *doub_1d,
double *peaks,
double *loc,
int *loc_sz,
int *peaks_sz,
cufftHandle plan)
{
//Conversion from float to float2
datatransfer<<<numBlocks,threadsPerBlock>>>(a2,d_a);
//First FFT
cufftExecC2C(plan,(cufftComplex *)a2,(cufftComplex *)a2, CUFFT_FORWARD)
//Perform absolute value
magnitude_kernel<<<numBlocks,threadsPerBlock>>>(a2,a3);
//Swap quadrants
swap_quadrants<<<numBlocks,threadsPerBlock>>>(a3,a2);
//Perform 2nd FFT
cufftExecC2C(plan,(cufftComplex *)a2,(cufftComplex *)a2, CUFFT_FORWARD)
//Perform 2nd absolute value
magnitude_kernel<<<numBlocks,threadsPerBlock>>>(a2,a3);
//Swap quadrants
swap_quadrants<<<numBlocks,threadsPerBlock>>>(a3,a2);
//Transfer Middle Two rows to a 1D array
transferpeakdata<<<3,1024>>>(d_1d,a2);
//Copy 1D array to host
cudaMemcpy(h_1d,d_1d,sizeof(float)*width,cudaMemcpyDeviceToHost);
//Better Peak method
int glob_max_ind;
int max_ind[2];
double max_num;
int distance;
get_peaks(doub_1d,peaks,peaks_sz,loc,loc_sz);
std::cout<<"Global max index: "<<max_ind[0]<<" Second max index: "<<max_ind[1]<<std::endl;
std::cout<<"Number of peaks: "<<peaks_sz[1]<<std::endl;
distance = abs(max_ind[0] - max_ind[1]);
std::cout<<"Distance: "<<distance<<std::endl;
}
int main()
{
using std::chrono::duration_cast;
using std::chrono::nanoseconds;
typedef std::chrono::high_resolution_clock clock;
//Cuda variables
cufftHandle plan;
cufftComplex *h_array2,*array2,*h_array3,*array3;
float *h_1darray,*dev_1darray,*dev_array;
double double_array[2448],peaks_array[2448],loc_data[2448];
int peaks_size[2],loc_size[2];
float *array = new float;
//Allocate memory
h_array2 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_array3 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_1darray = (float*)malloc(width*sizeof(float));
cudaMalloc((void **) &dev_array,sizeof(float)*size);
cudaMalloc((void **) &array2, sizeof(cufftComplex)*size);
cudaMalloc((void **) &array3, sizeof(cufftComplex)*size);
cudaMalloc((void **) &dev_1darray, sizeof(float)*width);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to allocate\n");
fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));
}
//Copy Variable to GPU
if (cudaMemcpy(array2,h_array2,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to array2"<<std::endl;
}
if (cudaMemcpy(array3,h_array3,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to array3"<<std::endl;
}
if (cudaMemcpy(dev_1darray,h_1darray,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to dev_1darray"<<std::endl;
}
//Plan 2D FFT
if (cufftPlan2d(&plan,height,width,CUFFT_C2C) != CUFFT_SUCCESS)
{
std::cout<<"Failed to make 1st FFT plan"<<std::endl;
}
Camera camera;
// Connect the camera
camera.Connect( 0 );
// Start capture
camera.StartCapture();
//Initialize loop
cv::Mat image = cv::Mat::ones(height,width,CV_32FC1);
cudaMemcpy(dev_array,array,sizeof(float)*size,cudaMemcpyHostToDevice);
//Capture Loop
int ii =0;
auto start = clock::now();
char key = 0;
while (key != 'q') {
//autofocus algorithm
std::thread t1(algorithm,dev_array,
array2,
array3,
h_1darray,
dev_1darray,
double_array,
peaks_array,
loc_data,
loc_size,
peaks_size,
plan);
// Get the image
Image rawImage;
error = camera.RetrieveBuffer( &rawImage );
if ( error != PGRERROR_OK )
{
std::cout << "capture error" << std::endl;
continue;
}
// convert to rgb
Image rgbImage;
rawImage.Convert( FlyCapture2::PIXEL_FORMAT_MONO8, &rgbImage );
// convert to OpenCV Mat
unsigned int rowBytes = (double)rgbImage.GetReceivedDataSize()/(double)rgbImage.GetRows();
image = cv::Mat(rgbImage.GetRows(), rgbImage.GetCols(), CV_8UC1, rgbImage.GetData(),rowBytes);
imshow("image",image);
//Convert
image.convertTo(image,CV_32FC1);
array = (float*)image.data;
//synchronize
t1.join();
//Copy Image Array to GPU
cudaMemcpy(dev_array,array,sizeof(float)*size,cudaMemcpyHostToDevice);
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to copy to dev_array\n");
fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));
}
key = cv::waitKey(3);
ii = ii+1;
}
//Finish timing the loop
cudaDeviceSynchronize();
auto end = clock::now();
std::cout <<"Loop Time: "<< (duration_cast<nanoseconds>(end-start).count())/(ii*1000000)<<std::endl;
cudaFree(array2);
cudaFree(array3);
cudaFree(dev_1darray);
cudaFree(dev_array);
return 0;
}