I am using cudaMemcpy
in one thread, and it affects the execution of TensorRT streams in other models. I suspect it might be due to streams. The specific description is as follows:
-
In one thread, I use
cudaMemcpy
and NPP to perform YUV to RGB conversion. -
In another thread, there are TensorRT and other deep learning models.
The problem occurs when using `cudaMemcpy`, as it requires waiting for the execution of the TensorRT and other deep learning models to complete. How can I resolve this issue? I use `cudaMemcpy` during YUV to RGB conversion, and I noticed that `cudaMemcpy` becomes slow when loading deep learning models for inference. Additionally, I mentioned that these two processes are not directly related in the code, as the images used for inference are loaded separately and not directly linked to YUV to RGB conversion.
i use cudaMemcpyAsync and stream is same.
where i use :
virtual void OnImgNotify(const CMediaDataInfo& aDataInfo) override
{
double post = (double)cv::getTickCount();
printf(“img size: %dx%d, timestamp:%.3f, %d\n”, aDataInfo.width, aDataInfo.height, aDataInfo.timestamp, aDataInfo.linesize[0]);
printf(“%d\n”, aDataInfo.linesize[1]);//复制出一帧的YUV数据,这里假设为yuv420p的格式,并且step和图像的宽一样去复制 gMutexYUV.lock(); nvtxRangePushA("OnImgNotify"); // COLS= aDataInfo.width; // ROWS= aDataInfo.height; if(COLS!=aDataInfo.width||ROWS!=aDataInfo.height) std::cout<<"decode output yuv with different size of init input!"; linesize[0] = aDataInfo.linesize[0]; linesize[1] =aDataInfo.linesize[1]; linesize[2] =aDataInfo.linesize[2]; //需要再memcpy前,先显式将空间对其到主机端,否则再加载onnx模型后会导致错误的发生 cudaStreamAttachMemAsync(NULL,gYUV[0],0,cudaMemAttachGlobal); cudaStreamAttachMemAsync(NULL,gYUV[1],0,cudaMemAttachGlobal); cudaStreamAttachMemAsync(NULL,gYUV[2],0,cudaMemAttachGlobal); cudaMemcpyAsync(gYUV[0],aDataInfo.datas[0],COLS * ROWS ,cudaMemcpyHostToDevice,stream); cudaMemcpyAsync(gYUV[1],aDataInfo.datas[1],COLS * ROWS/4,cudaMemcpyHostToDevice,stream); cudaMemcpyAsync(gYUV[2],aDataInfo.datas[2],COLS * ROWS/4,cudaMemcpyHostToDevice,stream); cudaStreamSynchronize(stream); nvtxRangePop(); gMutexYUV.unlock(); post = (double)cv::getTickCount() - post; std::cout << "OnImgNotify time :" << post*1000.0 / cv::getTickFrequency() << " ms \n"; // YUV2BGRNpp(); }
where i init:
void initYuv2bgr(int w,int h)
{
//当前step等待后续给出,而不是自己计算
COLS =w;
ROWS =h;
//用于中转的yuv空间
cudaMallocManaged(&gYUV[0],COLS * ROWS);
cudaMallocManaged(&gYUV[1],COLS * ROWS/4);
cudaMallocManaged(&gYUV[2],COLS * ROWS/4);cudaMallocManaged(&manageBGR, COLS * ROWS * 3);
cudaStreamCreate(&stream);}