Zero-copy shared memory does save CPU/GPU data-transfer time, but the programs that use that memory become much more CPU intensive.
Is this normal? If not, how can I fix or optimize it?
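srcDevData and midDevData are global zero-copy buffers allocated elsewhere; the attach calls in the function below suggest CUDA managed memory, so the allocation presumably looks roughly like this (a sketch: the allocBuffers helper name and the sizes are illustrative, not the original code):

// Assumed global zero-copy buffers; the names match the function below,
// but this allocation helper (allocBuffers) is hypothetical.
uchar* srcDevData = nullptr;   // source BGR image, one uchar per channel
float* midDevData = nullptr;   // letterboxed/normalized intermediate

void allocBuffers(int maxSrcHeight, int maxSrcWidth, int dstHeight, int dstWidth)
{
    // On Jetson, cudaMallocManaged returns one allocation visible to both CPU and GPU,
    // so the CPU memcpy in preprocess2gpu writes directly into memory the kernels can read.
    cudaMallocManaged(&srcDevData, sizeof(uchar) * maxSrcHeight * maxSrcWidth * 3);
    cudaMallocManaged(&midDevData, sizeof(float) * dstHeight * dstWidth * 3);
}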
void preprocess2gpu(const cv::Mat& srcImg, float* dstData, const int dstHeight, const int dstWidth, const cudaStream_t& preprocess_s)
{
    int srcHeight = srcImg.rows;
    int srcWidth = srcImg.cols;
    int srcElements = srcHeight * srcWidth * 3;
    int dstElements = dstHeight * dstWidth * 3;

    // Attach the managed buffer to the host, copy the frame in on the CPU,
    // then attach it back to the GPU for kernel access.
    cudaStreamAttachMemAsync(preprocess_s, srcDevData, 0, cudaMemAttachHost);
    cudaStreamSynchronize(preprocess_s); // the attach is async; wait for it before the CPU writes
    memcpy(srcDevData, srcImg.data, sizeof(uchar) * srcElements);
    cudaStreamAttachMemAsync(preprocess_s, srcDevData, 0, cudaMemAttachGlobal);
    // cudaMemcpy(srcDevData, srcImg.data, sizeof(uchar) * srcElements, cudaMemcpyHostToDevice);

    // calculate width and height after resize (keep aspect ratio, center the result)
    int w, h, x, y;
    float r_w = dstWidth / (srcWidth * 1.0);
    float r_h = dstHeight / (srcHeight * 1.0);
    if (r_h > r_w) {
        w = dstWidth;
        h = r_w * srcHeight;
        x = 0;
        y = (dstHeight - h) / 2;
    }
    else {
        w = r_h * srcWidth;
        h = dstHeight;
        x = (dstWidth - w) / 2;
        y = 0;
    }

    dim3 blockSize(32, 32);
    dim3 gridSize((dstWidth + blockSize.x - 1) / blockSize.x, (dstHeight + blockSize.y - 1) / blockSize.y);

    // letterbox and resize
    letterboxNorm<<<gridSize, blockSize, 0, preprocess_s>>>(srcDevData, srcHeight, srcWidth, midDevData, dstHeight, dstWidth, h, w, y, x);
    // keep both kernels on the same stream (the second launch originally used the default stream)
    process<<<gridSize, blockSize, 0, preprocess_s>>>(midDevData, dstData, dstHeight, dstWidth);
    cudaStreamSynchronize(preprocess_s);
}
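For reference, the function is called roughly like this (a sketch: the 640x640 input size for yolov8n-pose, the frame source, and the dstData allocation are illustrative assumptions):

// Hypothetical caller; assumes allocBuffers from the sketch above.
float* dstData = nullptr;
cudaMalloc(&dstData, sizeof(float) * 3 * 640 * 640);  // device buffer for the network input

cudaStream_t preprocess_s;
cudaStreamCreate(&preprocess_s);

cv::Mat frame = cv::imread("test.jpg");            // illustrative frame source
allocBuffers(frame.rows, frame.cols, 640, 640);    // hypothetical helper from the sketch above
preprocess2gpu(frame, dstData, 640, 640, preprocess_s);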
Environment info:
TensorRT 8.5.2.2
cuDNN 8.6.0.166
CUDA 11.4
Ubuntu 20.04
Jetson Xavier NX
AI model: yolov8n-pose.engine