My application does inference with a custom segmentation model. When I profiled it with Nsight Systems, I found that the device-to-host cudaMemcpy was taking around ~33 ms, whereas the host-to-device copy was under 1 ms. Some articles I read suggested adding cudaDeviceSynchronize() to separate out the kernel execution time. After adding that to my code, the device-to-host cudaMemcpy dropped to under 1 ms, but the cudaDeviceSynchronize() call now takes almost 33 ms.
Here’s the timeline view after adding cudaDeviceSynchronize():
Here’s the code:
// Normalize the input image to [0, 1] as a single-channel float
cv::minMaxLoc(cpu_img, &minVal, &maxVal, &minLoc, &maxLoc);
cpu_img.convertTo(cpu_img, CV_32FC1);
cpu_img = cpu_img / float(maxVal);

// Copy the preprocessed input to the device asynchronously
cudaMemcpyAsync(buffers[0], cpu_img.data, sizeof(float) * dl_input_size * dl_input_size, cudaMemcpyHostToDevice, stream);

// Enqueue TensorRT inference on the same stream, then block until it finishes
context->enqueueV2(buffers.data(), stream, nullptr);
cudaDeviceSynchronize();

// Copy the two output tensors back to the host asynchronously
std::vector<float> cpu_output(getSizeByDim(dims[0]) * batch_size);
cudaMemcpyAsync(cpu_output.data(), (float*)gpu_output[1], cpu_output.size() * sizeof(float), cudaMemcpyDeviceToHost, stream);
std::vector<float> cpu_output_landmark(getSizeByDim(dims[1]) * batch_size);
cudaMemcpyAsync(cpu_output_landmark.data(), (float*)gpu_output[2], cpu_output_landmark.size() * sizeof(float), cudaMemcpyDeviceToHost, stream);

// Wait for the copies to complete before using the host buffers
cudaStreamSynchronize(stream);
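From what I understand, the cudaMemcpyAsync on the same stream has to wait for enqueueV2 to finish, so without the explicit sync the ~33 ms of inference time was being attributed to the device-to-host copy. Would bracketing the enqueue with CUDA events be the right way to isolate the inference time instead? Here is a rough sketch I put together (the event variable names are just placeholders, not from my actual application):

// Sketch: record events around the TensorRT enqueue so the inference time
// is reported separately from the async copies and the final synchronize.
cudaEvent_t infer_start, infer_stop;   // placeholder names
cudaEventCreate(&infer_start);
cudaEventCreate(&infer_stop);

cudaEventRecord(infer_start, stream);
context->enqueueV2(buffers.data(), stream, nullptr);
cudaEventRecord(infer_stop, stream);

// ... enqueue the cudaMemcpyAsync device-to-host copies here ...

cudaStreamSynchronize(stream);         // wait for all work on the stream

float infer_ms = 0.0f;
cudaEventElapsedTime(&infer_ms, infer_start, infer_stop);  // time spent in enqueueV2's kernels only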