I want to use a pipelined algorithm where I can transfer a image to the GPU using one stream when the other stream is doing the computation task at CUDA kernel.
Based upon some suggestions, I have the following code which is running without any error or crash but it has two major problems. First of all, it is making the PC very slow and taking a lot of time. Secondly, I think that the resulting processed image is not proper. Sometimes, I think that there are some patches on it which means it could not be processed completely.
I am adding a completely working example of my code but it will need OpenCV libraries. Please let me know, if there is some conceptual problem with my code.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <opencv2/highgui.hpp>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
using namespace cv;
using namespace std;
RNG randg(123456);
const cv::Size IMG_SIZE(1024, 1024);
int cur_frame;
__global__ void proc_kernel(unsigned char *d_cur, int iw, int ih, int numChannels, unsigned char R_val, unsigned char G_val, unsigned char B_val)
{
// Calculate our pixel's location
int Row = blockIdx.y*blockDim.y + threadIdx.y;
int Column = blockIdx.x*blockDim.x + threadIdx.x;
// Operate only if we are in the correct boundaries
if(Column >= 0 && Column < iw && Row >= 0 && Row < ih)
{
d_cur[ numChannels* (iw*Row + Column) + 0 ] = (unsigned char)R_val;
d_cur[ numChannels* (iw*Row + Column) + 1 ] = (unsigned char)G_val;
d_cur[ numChannels* (iw*Row + Column) + 2 ] = (unsigned char)B_val;
}
}
bool captureImg(unsigned char *h_cur)
{
cur_frame++;
Mat img(IMG_SIZE, CV_8UC3);
img = cv::Scalar(randg.uniform(100,150),randg.uniform(150,200), randg.uniform(10,100));
imshow("Original image", img);
waitKey(33);
h_cur = img.data;
//cout<<(int)cur_frame<<". Frame captured"<<endl;
return false;
}
bool validate_image(unsigned char *img)
{
//Display processed image
Mat processedImg(IMG_SIZE, CV_8UC3);
unsigned char *ptr_processedImage;
ptr_processedImage = img;
processedImg.data = ptr_processedImage;
imshow("Processed Image", processedImg);
waitKey(33);
return true;
}
void CUDART_CB my_callback(cudaStream_t stream, cudaError_t status, void* data)
{
validate_image((unsigned char *)data);
}
int main()
{
bool done = false;
unsigned char *h_imgA, *h_imgB, *d_imgA, *d_imgB, *ptr_processedImage;
//Define a template image to initilize paramters for testing
Mat paramImg(IMG_SIZE, CV_8UC3);
paramImg = cv::Scalar(randg.uniform(150,250),randg.uniform(150,250), randg.uniform(150,250));
//Define an empty image to display the final processed image after the GPU processing
Mat processedImg(IMG_SIZE, CV_8UC3);
//Image parameters
size_t dsize = paramImg.channels() * paramImg.rows * paramImg.cols*sizeof(unsigned char);
int iw = paramImg.cols;
int ih = paramImg.rows;
int numChannels = paramImg.channels();
//Allocate buffer memory at host
//cudaHostAlloc(&h_imgA, dsize, cudaHostAllocDefault);
//cudaHostAlloc(&h_imgB, dsize, cudaHostAllocDefault);
//Allocate memories at Device
cudaMalloc(&d_imgA, dsize);
cudaMalloc(&d_imgB, dsize);
//Define Cuda streams
cudaStream_t st1, st2;
cudaStreamCreate(&st1);
cudaStreamCreate(&st2);
cudaStream_t *curst = &st1;
cudaStream_t *nxtst = &st2;
//Set the process
unsigned char *h_cur = h_imgA;
unsigned char *h_nxt = h_imgB;
unsigned char *d_cur = d_imgA;
unsigned char *d_nxt = d_imgB;
done = captureImg(h_cur);
//Set kernel dimensions
dim3 numBlocks(numChannels* (ceil(iw/16.0)), numChannels*(ceil(ih/16.0)));
dim3 numThreads(16,16);
//Start the process
while(!done)
{
unsigned char R_val = randg.uniform(10,100), G_val = randg.uniform(100,200), B_val = randg.uniform(200,250);
//Copy source image from host to device
cudaMemcpyAsync(d_cur, h_cur, dsize, cudaMemcpyHostToDevice, *curst);
//Call CUDA kernel
proc_kernel<<<numBlocks, numThreads, 0, *curst>>>(d_cur, iw, ih, numChannels, R_val, G_val, B_val);
//Copy current image back to Host from Device
cudaMemcpyAsync(h_cur, d_cur, dsize, cudaMemcpyDeviceToHost, *curst);
// insert a cuda stream callback here to copy the cur frame to output
cudaStreamAddCallback(*curst, &my_callback, (void *)h_cur, 0);
cudaStreamSynchronize(*nxtst); // prevent overrun
done = captureImg(h_nxt); // capture nxt image while GPU is processing cur
unsigned char *h_tmp = h_cur;
h_cur = h_nxt;
h_nxt = h_tmp;
d_cur = d_nxt;
d_nxt = h_tmp;
cudaStream_t *st_tmp = curst;
curst = nxtst;
nxtst = st_tmp;
}
return 0;
}