So I’ve decided to keep only one inference in the cudaStream queue. It should still be faster than three execute() calls
in a row. Now I’m using this code (note that the idea remains the same: prepare and launch NN3’s enqueue();
while it works, launch NN2, and so on):
// Pipelined inference over three chained networks:
//   NN1 (vehicle detector) -> NN2 (license-plate detector) -> NN3 (symbol reader).
// Each call drains the previous iteration's results oldest-stage-first (NN3,
// NN2, NN1) and immediately re-feeds the next stage, so the three per-network
// CUDA streams can overlap work across frames.
// NOTE(review): on the very first call the streams are empty, so waitForResult()
// returns immediately and the output buffers hold garbage — confirm the skipNext
// flags are initialized so the first iteration is handled.
void ProcessCurrentFrame()
{
    double time_span;
    auto start = std::chrono::high_resolution_clock::now();

    // --- Stage 3: symbol-recognition result from the previous iteration ---
    if (NN3->waitForResult()) // blocks on NN3's stream; returns true when done
    {
        if (NN3->skipNext) // NN2 produced no plate picture, so NN3 had nothing to run on
            NN3->skipNext = false; // reset for the next run
        else
        {
            NN3->copyOutputBuffers(); // enqueue D2H copies; parsing happens at the end
            // BUGFIX: copyOutputBuffers() only *enqueues* asynchronous copies.
            // Without syncing here, getResult() at the bottom of this function
            // would race with the in-flight memcpy. Syncing now (before new work
            // is queued on NN3's stream) keeps the later enqueue overlapped.
            NN3->waitForResult();
        }
    }

    // --- Stage 2: license-plate detection result ---
    if (NN2->waitForResult())
    {
        if (NN2->skipNext) // no objects from NN1 last time
            NN2->skipNext = false;
        else
        {
            NN2->copyOutputBuffers();
            // BUGFIX: same async-copy hazard — the host buffers are not valid
            // until the stream is synchronized again.
            NN2->waitForResult();
            // process output immediately to use the result in NN3
            auto result = NN2->getResult();
            if (result.size() > 0) // at least one object detected
            {
                // select the biggest (= closest) one
                cv::Rect2f biggestObject = getBiggestObject(result);
                // offset from cropped-object (NN1 output) coordinates back to
                // original-frame coordinates
                biggestObject += cv::Point2f(
                    NN2->used_rects.front().x,
                    NN2->used_rects.front().y
                );
                // void sendFrameToNN(cv::Mat frame, cv::Rect2f roi)
                NN3->sendFrameToNN(           // look for symbols
                    NN2->used_frames.front(), // stored original frame
                    biggestObject             // ROI of the biggest detected object
                );
            }
            else // no objects detected -> nothing for NN3 next iteration
                NN3->skipNext = true;
        }
        NN2->used_frames.pop();
        NN2->used_rects.pop();
    }

    // --- Stage 1: vehicle detection result (same pattern as NN2) ---
    if (NN1->waitForResult())
    {
        NN1->copyOutputBuffers();
        // BUGFIX: sync before reading the host-side output buffers (see above).
        NN1->waitForResult();
        auto result = NN1->getResult();
        if (result.size() > 0)
        {
            cv::Rect2f biggestObject = getBiggestObject(result);
            NN2->sendFrameToNN( // look for the license plate
                NN1->used_frames.front(),
                biggestObject
            );
        }
        else
            NN2->skipNext = true;
        // BUGFIX: the consumed frame was never popped, so NN1->used_frames grew
        // without bound and front() kept returning the very first frame pushed.
        // Mirrors the NN2 queue bookkeeping above.
        NN1->used_frames.pop();
    }

    // kick off the next vehicle-detection pass
    NN1->sendFrameToNN(currentFrame);

    // Safe now: NN3's D2H copies were synchronized at the top of this function.
    std::vector<DetectedObject> finalObjects = NN3->getResult();
    // do things with result
    // ...

    time_span = std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::high_resolution_clock::now() - start).count();
    avg = (avg * internal_counter + time_span) / (internal_counter + 1);
    ++internal_counter;
    std::cout << time_span << " secs, " << avg << " secs avg\n";
}
And I’m getting the same time as when I use execute()
.
I create the CUDA stream with flags, copy the data, and call inference — all using async methods:
class NN
{
// ...
cudaStream_t fStream;
nvinfer1::IExecutionContext* fContext;
float** fOutBuffers; // store output of network;
void* bindings[3]; // pointers on GPU memory
nvinfer1::Dims fInputDims; // network input dimensions
nvinfer1::Dims fOutputDims[2]; // network output dimensions, 2 layers
NN()
{
// ...
CHECK(cudaMalloc(&bindings[0], fInputDims.d[0] * fInputDims.d[1] * fInputDims.d[2] * sizeof(float)));
CHECK(cudaMalloc(&bindings[1], fOutputDims[0].d[0] * fOutputDims[0].d[1] * fOutputDims[0].d[2] * sizeof(float)));
CHECK(cudaMalloc(&bindings[2], fOutputDims[1].d[0] * fOutputDims[1].d[1] * fOutputDims[1].d[2] * sizeof(float)));
cudaStreamCreateWithFlags(&fStream, cudaStreamNonBlocking);
}
void sendFrameToNN(cv::Mat &frame, cv::Rect2f &rect)
{
auto blob = convertImageToBlob(
cv::Mat(frame.clone(), rect),
fInputDims.d[1],
fInputDims.d[2],
false
);
CHECK(cudaMemcpyAsync(bindings[0], (void*)(blob.data()), fInputDims.d[0] * fInputDims.d[1] * fInputDims.d[2] * sizeof(float), cudaMemcpyHostToDevice, fStream));
fContext->enqueue(1, bindings, fStream, &inputIsFree);
// ...
}
bool waitForResult()
{
cudaStreamSynchronize(fStream);
return true;
}
void copyOutputBuffers()
{
CHECK(cudaMemcpyAsync(fOutBuffers[0], bindings[1], fOutputDims[0].d[0] * fOutputDims[0].d[1] * fOutputDims[0].d[2] * sizeof(float), cudaMemcpyDeviceToHost, fStream));
CHECK(cudaMemcpyAsync(fOutBuffers[1], bindings[2], fOutputDims[1].d[0] * fOutputDims[1].d[1] * fOutputDims[1].d[2] * sizeof(float), cudaMemcpyDeviceToHost, fStream));
}
// ...
}
And in NVVP it looks like all 3 NNs use the same stream (No. 14, to be exact).
Changing all this code to the sync variants only leads to using the default CUDA stream instead of stream 14.