Hi, I am trying to do some image corrections on GPU for “N” number of images. There are 3 kernels that needs to be invoked for each image. Lets say I have 3 kernels(A,B,C). For a particular image, first I have to invoke kernel A & then B & C. Here these kernel executions should be synchronous i.e execution of kernel A should be completed before starting of B & similarly execution of B should be completed before starting C.
I am completely new to this cuda programming & I have started something with the below approach.
Kernels - A,B,C
Images - IMG1, IMG2, IMG3, IMG4, IMG5
I am following the below sequence for the executions
(IMG1 HostToDevice->IMG1 A->IMG1 B->IMG1 C-> IMG1 DeviceToHost->IMG2 HostToDevice->IMG2 A->IMG2 B->IMG C → IMG2 DeviceToHost & so on…).
All the above operations are synchronous & it is taking a lot of time because of this. Now when I searched about concurrent kernel executions & transfers I came to know about streams & I have tried to use cuda streams to make use of concurrent kernel executions & data transfers. But I didn’t see difference related to the times before & after using the streams. here is the code snippet that I am using
int main()
{
std::vector<cv::Mat> images; // assume that it has 5 images
int number_of_channels = 3;
int image_width = 1912;
int image_height = 1192;
int original_img_size = sizeof(float) * number_of_channels * image_width * image_height;
float ** device_original_images;
float ** device_processed_images;
float ** host_processed_images;
// Allocate memory in the CPU
host_processed_images = (float **)malloc(sizeof(float *) * 5);
// Allocate memory in the GPU
cudaMalloc(&m_device_original_images, sizeof(float *) * 5);
cudaMalloc(&m_device_processed_images, sizeof(float *) * 5);
for(int idx=0;idx<5;++idx)
{
host_processed_images[idx]= (float *)malloc(original_img_size);
float * a;
cudaMalloc(&a, original_img_size);
cudaMemcpy(&m_device_original_images[idx], &a, sizeof(float *),
cudaMemcpyHostToDevice);
float * b;
cudaMalloc(&b, original_img_size);
cudaMemcpy(&m_device_processed_images[idx], &b, sizeof(float *),
cudaMemcpyHostToDevice);
}
int nStreams = 5; // preparation for streams
cudaStream_t *streams = (cudaStream_t *)malloc(nStreams * sizeof(cudaStream_t));
for(int ii=0;ii<nStreams;ii++){
cudaStreamCreate(&streams[ii]);
}
// copy image data from cpu to gpu asynchronously for all the images
float ** temp_cpu_pointer_original_images = (float **)malloc(sizeof(float *));
float ** temp_cpu_pointer_processed_images = (float **)malloc(sizeof(float *));
// Allocates block size and grid size
dim3 threads_per_block(16, 16);
dim3 blocks_per_grid((int)ceil((image_width)/16) + 1,
(int)ceil((image_height)/16) + 1);
for(int idx=0;idx<5; ++idx)
{
temp_cpu_pointer_original_images[idx] = (float *)malloc(sizeof(float));
temp_cpu_pointer_processed_images[idx] = (float *)malloc(sizeof(float));
cudaMemcpy(&temp_cpu_pointer_original_images[idx], &m_device_original_images[idx], sizeof(float *), cudaMemcpyDeviceToHost);
cudaMemcpy(&temp_cpu_pointer_processed_images[idx], &m_device_processed_images[idx], sizeof(float *), cudaMemcpyDeviceToHost);
}
for(int idx=0; idx < 5; ++idx)
{
cudaMemcpyAsync(temp_cpu_pointer_original_images[idx], images[idx].data,
(sizeof(float) * image_width * image_height * number_of_channels),
cudaMemcpyHostToDevice, streams[idx]);
// Invoke Kernels
A<<<blocks_per_grid, threads_per_block, 0, streams[idx]>>>()
B<<<blocks_per_grid, threads_per_block, 0, streams[idx]>>>()
C<<<blocks_per_grid, threads_per_block, 0, streams[idx]>>>()
cudaMemcpyAsync(m_host_processed_images[idx], temp_cpu_pointer_processed_images[idx],
(sizeof(float) * image_width * image_height * number_of_channels),
cudaMemcpyDeviceToHost, streams[idx]);
}
for(int idx=0;idx<5;++idx)
{
cudaStreamSynchronize(streams[idx]);
}
for(int idx=0;idx<5;++idx)
{
cv::Mat final_image(image_height, image_width, CV_8UC3,(void *) m_host_processed_images[idx]);
images[idx] = final_image;
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
return;
}
}
With the above code snippet, I am not seeing any differences in the timings of before & after using of streams. Please let me know If I am doing anything wrong or is there any other way I can do these executions so that overall time will be lesser(something like a change in how I am storing the images or transferring). Thanks in advance!.
hardware being used: GeForce 3060